July 30, 2015

Data aquisition

  • FASTQ files are aligned to ce10 reference genome using BWA
  • Tags are summarized using coding gene model (only exonic reads) and ERCC spike-ins
  • SummmarizedExperiment structures with annotations are acquired from JADB
ContactExpIDs <- paste0("rFB0", 26:32)
require(JADBtools)
require(GenomicAlignments)
se <- getSummarizedEperiment(ContactExpIDs) 
rownames(se)[rownames(se) == ''] <- 
    as.character(elementMetadata(se)$geneName[rownames(se) == ''])
colnames(se) <- sapply(strsplit(colnames(se), '_'), '[[', 10)
show(se)
## class: SummarizedExperiment 
## dim: 20633 7 
## exptData(0):
## assays(1): counts
## rownames(20633): WBGene00000001 WBGene00000002 ... ERCC-00170
##   ERCC-00171
## rowRanges metadata column names(3): geneName desc seqID
## colnames(7): rFB026 rFB027 ... rFB031 rFB032
## colData names(2): strain stage

Get spike-ins annotations

tab <- read.table(
    system.file('spikeIN/cms_095046.txt', package = 'JADBtools'), 
    sep = '\t', header = TRUE, row.names = 2
)
attomole <- 602200
infoIN <- unlist(getDBdataField(paste0("rFB0", 26:32), field = 'SpikeIN'))
str(tab)
## 'data.frame':    92 obs. of  6 variables:
##  $ Re.sort.ID                           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ subgroup                             : Factor w/ 4 levels "A","B","C","D": 1 1 1 1 1 1 1 1 1 1 ...
##  $ concentration.in.Mix.1..attomoles.ul.: num  30000 7500 1875 938 469 ...
##  $ concentration.in.Mix.2..attomoles.ul.: num  7500 1875 469 234 117 ...
##  $ expected.fold.change.ratio           : num  4 4 4 4 4 4 4 4 4 4 ...
##  $ log2.Mix.1.Mix.2.                    : num  2 2 2 2 2 2 2 2 2 2 ...
infoIN
## rFB026.SpikeIN rFB027.SpikeIN rFB028.SpikeIN rFB029.SpikeIN rFB030.SpikeIN 
##   "Spike-in 1"   "Spike-in 2"   "Spike-in 1"   "Spike-in 2"   "Spike-in 1" 
## rFB031.SpikeIN rFB032.SpikeIN 
##   "Spike-in 2"   "Spike-in 1"

Construct DESeqDataSet and plot PCA

require(DESeq2); require(magrittr); require(ggplot2)
dds <- DESeqDataSet(se, ~strain)
plotPCA( 
    DESeqTransform( se ), intgroup = c("strain", "stage"), 
    ntop=Inf, returnData = TRUE 
)  %>% ggplot(., aes(PC1, PC2, col=strain, label=name, shape=stage))  + 
    geom_text(vjust=1.2) + geom_point(size=3) -> p; print(p)

Calculate RPKM and gater data in plotable format

require(reshape2); require(dplyr);
rpkm <- fpkm(dds, robust = FALSE)
count <- melt(
    rpkm[grep('ERCC', rownames(rpkm)),], 
    varnames = c('spike','smpl'), value.name = 'count'
)

tab <- tab[grep('ERCC', rownames(rpkm), value = TRUE),]
cont <- melt( 
    tab[,grepl('2', infoIN)+3]*attomole,
    varnames = c('mix', 'spike'),value.name = 'cont'
)
data <- cbind( count, cont )
data  %>% mutate(cont=log2(cont), count=log2(count))  %>% 
    filter(!is.infinite(count)) -> data_log
head(data_log, 3)
##        spike   smpl    count                              variable
## 1 ERCC-00002 rFB026 6.907147 concentration.in.Mix.1..attomoles.ul.
## 2 ERCC-00003 rFB026 4.387187 concentration.in.Mix.1..attomoles.ul.
## 3 ERCC-00004 rFB026 7.319360 concentration.in.Mix.1..attomoles.ul.
##       cont
## 1 33.07256
## 2 29.07256
## 3 32.07256

Plot spike-in regressions

plotSpikeIN(data_log)

Plot spike-in regressions (RPKM>1)

data_log  %>% filter(count > 0)  %>% plotSpikeIN(xpos = 30, ypos = 0)

Calculate regression coefficients and p-values (from F statistics)

library(plyr)
data_log  %>% filter(count > 0)  %>% ddply(.(smpl), function(df) {
    m <- lm(count ~ cont, data=df)
    data.frame(a = coef(m)[1], b = coef(m)[2], pval=anova(m)$'Pr(>F)'[[1]] )
}) -> coefs
coefs
##     smpl         a         b         pval
## 1 rFB026 -24.13285 0.9604284 1.580335e-13
## 2 rFB027 -24.80886 0.9825533 1.002610e-13
## 3 rFB028 -24.42605 0.9797744 4.223479e-15
## 4 rFB029 -25.18550 0.9758320 2.173946e-10
## 5 rFB030 -22.74076 0.8926094 1.146701e-09
## 6 rFB031 -26.27531 1.0056719 4.739706e-10
## 7 rFB032 -22.86034 0.9141125 1.940886e-11

Normalizing for library depth, part 1

  • Using all reads
counts(dds, normalized = FALSE, replaced = FALSE)  %>% colSums -> read_sums
read_sums/mean(read_sums)
##    rFB026    rFB027    rFB028    rFB029    rFB030    rFB031    rFB032 
## 1.1883245 1.2098300 1.0391636 0.7534038 0.8308662 1.1542016 0.8242103
  • Using spike-in reads
counts(dds, normalized = FALSE, replaced = FALSE)[grep('ERCC', rownames(dds)),]  %>% colSums -> spike_sums
spike_sums/mean(spike_sums)
##    rFB026    rFB027    rFB028    rFB029    rFB030    rFB031    rFB032 
## 1.3435317 1.4469372 1.4375316 0.5999691 0.5424518 0.8161859 0.8133927

Normalizing for library depth, part 2

  • using the "median ratio method" described by Equation 5 in Anders and Huber (2010) for all reads
dds %>% estimateSizeFactors  %>% sizeFactors -> read_sf
read_sf/mean(read_sf)
##    rFB026    rFB027    rFB028    rFB029    rFB030    rFB031    rFB032 
## 1.1674125 1.1260571 0.8023782 0.8284577 0.8944839 1.2775354 0.9036754
  • using the "median ratio method" described by Equation 5 in Anders and Huber (2010) for spike-in reads
dds[grep('ERCC', rownames(dds)),] %>% estimateSizeFactors  %>% 
    sizeFactors -> spike_sf
spike_sf/mean(spike_sf)
##    rFB026    rFB027    rFB028    rFB029    rFB030    rFB031    rFB032 
## 1.2575807 1.6114569 1.4078870 0.6055446 0.5049380 0.8923032 0.7202895

Plot as barplot (normalized to 1)

dat <- data.frame(read_sums, spike_sums, read_sf, spike_sf)
barplot( 
    t(as.matrix(dat))/colMeans(as.matrix(dat)), 
    beside=TRUE, legend.text=names(dat) 
)

Plot as barplot (normalized to 1)

dat <- data.frame(read_sums, spike_sums, read_sf, spike_sf)
barplot( 
    t(as.matrix(dat))/colMeans(as.matrix(dat)), 
    beside=TRUE, legend.text=names(dat) 
)

Distribution of data in sampls

rpkm  %>% melt(value.name = 'x', varnames = c('gene', 'smpl'))  %>% 
    ggplot(aes(smpl, log2(x), col=smpl)) +  geom_boxplot(alpha=0.7)  + 
    geom_violin(alpha=0.5) + ylab('Expression [log2(RPKM)]') + xlab('Experiment')

Distribution of spike-ins - plot 1

spikes <- grep("^ERCC", rownames(rpkm), value = TRUE)
infoIN  %>%  sapply(rep, spikes %>% length)  %>% melt(value.name = 'Mix')  %>% 
    dplyr::select(Mix)  %>% 
    cbind(melt(rpkm[spikes,]), subgroup=tab[spikes,]$subgroup, .)  %>% 
    ggplot(aes(x=Var1, y=log2(value), col=subgroup, shape=Mix)) + 
    geom_point(size=4) + scale_shape_manual(values = c(20, 2)) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Distribution of spike-ins - plot 2

infoIN  %>%  sapply(rep, spikes %>% length)  %>% melt(value.name = 'Mix')  %>% 
    dplyr::select(Mix)  %>% cbind(melt(rpkm[spikes,]), 
    subgroup=tab[spikes,]$subgroup, ., rat=tab[spikes,]$log2.Mix.1.Mix.2.)  %>% 
    ggplot(aes(x=rat, y=log2(value), col=subgroup, shape=Mix)) + 
    geom_violin(alpha=0.5) + geom_boxplot(alpha=0.5) +
    geom_jitter(size=4) + scale_shape_manual(values = c(20, 2))

Colecting data for RUV normalization

zfGenes <- assays(se)$counts
zfGenes[spikes,infoIN=="Spike-in 2"] <- 
    zfGenes[spikes,infoIN=="Spike-in 2"]*tab[spikes,]$expected.fold.change.ratio
filter <- apply(zfGenes, 1, function(x) length(x[x>5])>=2)
filtered <- zfGenes[filter,]
genes <- rownames(filtered)[grep("^WB", rownames(filtered))]
spikes <- rownames(filtered)[grep("^ERCC", rownames(filtered))]
f <- factor(infoIN); levels(f) <- c('mix1', 'mix2');
colData(se)$mix <- f
colData(se)$spike <- colSums(zfGenes[spikes,])
colData(se)$sf <- zfGenes  %>% estimateSizeFactorsForMatrix
require(Heatplus)

Spike-in expression w/o mix scaleng

plot(annHeatmap(log2(assays(se)$counts[spikes,]+1), ann=colData(se), cluster=list(cuth=10)))

Spike-in expression witch mix scaleng

plot(annHeatmap(log2(zfGenes[spikes,]+1), ann=colData(se), cluster=list(cuth=10)))

Define PCA plotting fun

myPCA <- function(x) {
    x %>% DESeqTransform  %>% 
    plotPCA(intgroup = c("strain", "stage"), ntop=Inf, returnData = TRUE )  %>% 
    ggplot(., aes(PC1, PC2, col=strain, label=name, shape=stage))  + 
    geom_text(vjust=1.2) + geom_point(size=3)
}

PCA plot, unnormalized data

se[genes]  %>% myPCA

PCA plot, VST

se[genes]  %>% DESeqDataSet(~strain)  %>% DESeq -> ddsN
ddsN %>% varianceStabilizingTransformation %>% myPCA

PCA plot, normalized counts (DEseq2)

log2(counts(ddsN, normalized=TRUE) + 1) %>% 
    SummarizedExperiment(colData=colData(ddsN))  %>% myPCA

RUVg normalization using spike-in only

library(RUVSeq); library(EDASeq); library(RColorBrewer)
colData(se)$mix=NULL; colData(se)$sf=NULL
ruv <- RUVg(round(zfGenes), spikes, k=1)

differences <- matrix(data=c(c(1,2,4,5), c(3,6,7,-1)), byrow=TRUE, nrow=2)
ruvs <- RUVs(round(zfGenes), genes, k=1, differences)

require(edgeR)
design <- model.matrix(~strain, data=colData(se))
y <- DGEList(counts=assays(se)$counts[genes,], group=colData(se)$strain)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
ruvr <- RUVr(round(zfGenes), genes, k=1, res)

myHeatmap <- function(d, ...) {
    results(d) %>% .[order(.$padj),]  %>% head(50)  %>% rownames %>%
    counts(d)[.,]   %>% '+'(1)  %>% log2  %>% 
    annHeatmap(ann=colData(d), cluster=list(cuth=15, col=colors), ...)  %>% plot
}

colors <- brewer.pal(3, "Set2")

Impact of RUVg

par(mfrow=c(1,2))
plotRLE( zfGenes[genes,], col=colors[colData(se)[[1]]] )
plotRLE( ruv$normalizedCounts[genes,], col=colors[colData(se)[[1]]] )

Impact of RUVg

par(mfrow=c(1,2))
plotPCA( zfGenes[genes,], col=colors[colData(se)[[1]]] )
plotPCA( ruv$normalizedCounts[genes,], col=colors[colData(se)[[1]]] )

PCA plot, RUVg and VSN

ruv$normalizedCounts[genes,]  %>% SummarizedExperiment(colData=colData(se))  %>%
    DESeqDataSet(~strain) %>% varianceStabilizingTransformation %>% myPCA

Impact of RUVs

par(mfrow=c(1,2))
plotRLE( zfGenes[genes,], col=colors[colData(se)[[1]]] )
plotRLE( ruvs$normalizedCounts[genes,], col=colors[colData(se)[[1]]] )

Impact of RUVs

par(mfrow=c(1,2))
plotPCA( zfGenes[genes,], col=colors[colData(se)[[1]]] )
plotPCA( ruvs$normalizedCounts[genes,], col=colors[colData(se)[[1]]] )

PCA plot, RUVs and VSN

ruvs$normalizedCounts[genes,]  %>% SummarizedExperiment(colData=colData(se))  %>%
    DESeqDataSet(~strain) %>% varianceStabilizingTransformation %>% myPCA

Impact of RUVr

par(mfrow=c(1,2))
plotRLE( zfGenes[genes,], col=colors[colData(se)[[1]]] )
plotRLE( ruvr$normalizedCounts[genes,], col=colors[colData(se)[[1]]] )

Impact of RUVr

par(mfrow=c(1,2))
plotPCA( zfGenes[genes,], col=colors[colData(se)[[1]]] )
plotPCA( ruvr$normalizedCounts[genes,], col=colors[colData(se)[[1]]] )

PCA plot, RUVr and VSN

ruvr$normalizedCounts[genes,]  %>% SummarizedExperiment(colData=colData(se))  %>%
    DESeqDataSet(~strain) %>% varianceStabilizingTransformation %>% myPCA

Differential expression, raw data

zfGenes[genes,]  %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq -> dds
dds  %>% myHeatmap

results(dds)  %>% .$padj  %>% '<'(0.05)  %>% table
## .
## FALSE  TRUE 
## 14284     2

Differential expression, RUVg

ruv$normalizedCounts[genes,]  %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq -> ddsG
ddsG %>% myHeatmap

results(ddsG)  %>% .$padj  %>% '<'(0.05)  %>% table
## .
## FALSE  TRUE 
## 14313  1634

Differential expression (edgeR), RUVg

design <- model.matrix(~strain + w, data=data.frame(colData(se), w=as.numeric(ruv$W)))
y <- DGEList(counts=assays(se)$counts[genes,], group=colData(se)$strain)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)

Differential expression (edgeR), RUVg

topTags(lrt, 50) %>% rownames %>% ruv$normalizedCounts[.,]   %>% '+'(1)  %>% 
    log2  %>% annHeatmap(ann=colData(se), cluster=list(cuth=15, col=colors))  %>% plot

Control: stage DE with raw counts

se[genes,]  %>% DESeqDataSet(~stage)  %>% DESeq %>% myHeatmap

Control: stage DE with RUVg counts

ruv$normalizedCounts[genes,]  %>% DESeqDataSetFromMatrix(colData(se), ~stage)  %>% DESeq %>% myHeatmap

Control: strain DE with RUVs counts

ruvs$normalizedCounts[genes,]  %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq %>% myHeatmap

Control: strain DE with RUVr counts

ruvr$normalizedCounts[genes,]  %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq %>% myHeatmap

Control: strain DE with SPIKE counts

ruvr$normalizedCounts[genes,]  %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq %>% myHeatmap

Control: spike counts vs. RUVr coefs

ruvr$normalizedCounts[genes,]  %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq %>% myHeatmap

Control: spike counts vs. RUVr coefs

data.frame(spike_sums, ruv$W, names(spike_sums))  %>% 
    ggplot(aes((spike_sums), W_1, label=names.spike_sums.)) + geom_text() + 
    geom_point() + geom_smooth(method=lm) + scale_y_reverse()

Control: spike coefs vs. RUVr coefs

melt(
    assays(se)$counts[grep('ERCC', rownames(rpkm)),], 
    varnames = c('spike','smpl'), value.name = 'count'
)  %>% cbind( cont ) -> data_raw

data_raw %>% filter(count > 10)  %>% ddply(.(smpl), function(df) {
    m <- lm(cont ~ count, data=df)
    data.frame(a = coef(m)[1], b = coef(m)[2], pval=anova(m)$'Pr(>F)'[[1]] )
}) -> coef_raw

Control: spike coefs vs. RUVr coefs

plotSpikeIN(data_raw  %>% filter(count > 10), xpos =8.5, ypos=4) + 
    scale_x_log10() + scale_y_log10()

Control: spike coefs vs. RUVr coefs

data.frame(spike_sums, coef_raw, ruv$W, names(spike_sums)) %>% 
     ggplot(aes(b/mean(b), W_1, label=names.spike_sums.)) + geom_text() + 
     geom_point() + geom_smooth(method=lm)

Spike-in sums normalization

( filtered * (rep(1, nrow(filtered)) %*% t((spike_sums/1e3)/(read_sums/1e6))) )  %>%  
    "+"(1) %>% log2 %>% SummarizedExperiment(colData=colData(se))  %>% myPCA

Spike-in sums normalization

( filtered * (rep(1, nrow(filtered)) %*% t((spike_sums/1e3)/(read_sums/1e6))) ) %>% 
    round %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq %>% myHeatmap

Linear model normalization

data_raw %>% filter(count > 10)  %>% ddply(.(smpl), function(df) {
    ids <- as.character(unique(df$smpl))
    m <- lm(cont ~ count, data=df)
    out <- predict(m, data.frame(count=as.integer(assays(se[genes,ids])$counts) ))
    return(out)
} ) %>% .[-1]  %>% as.matrix  %>% t -> norm_coef
#norm_coef <- round( norm_coef )
#colnames(norm_coef) <-  colnames(se)

Spike-in Linear model

(norm_coef/attomole) %>% SummarizedExperiment(colData=colData(se))  %>% myPCA

Spike-in Linear model

(norm_coef/attomole) %>% round %>% DESeqDataSetFromMatrix(colData(se), ~strain)  %>% DESeq %>% myHeatmap

late embryo only

LE <- se[,colData(se)$stage == 'LE']

filter <- apply(zfGenes[,colnames(LE)], 1, function(x) length(x[x>5])>=2)
filtered <- zfGenes[,colnames(LE)][filter,]
genes <- rownames(filtered)[grep("^WB", rownames(filtered))]
spikes <- rownames(filtered)[grep("^ERCC", rownames(filtered))]

ruv_le <- RUVg(round(zfGenes[,colnames(LE)]), spikes, 1)

differences <- rbind(c(1,2), c(3,4))
ruvs_le <- RUVs(round(zfGenes[genes,colnames(LE)] ), genes, k=1, differences)

late embryo only RUVg

par(mfrow=c(1,2))
zfGenes[genes,colnames(LE)] %>% plotPCA(col=colors[colData(LE)[[1]]])
ruv_le$normalizedCounts[genes,]  %>% plotPCA(col=colors[colData(LE)[[1]]])

late embryo only RUVs

par(mfrow=c(1,2))
LE[genes,]  %>% assays  %>% .$counts  %>% plotPCA(col=colors[colData(LE)[[1]]])
ruvs_le$normalizedCounts[genes,]  %>% plotPCA(col=colors[colData(LE)[[1]]])

late embryo only RUVs

LE[genes,]  %>% DESeqDataSet(~strain)  %>% DESeq  %>% myHeatmap

late embryo only RUVs

ruv_le$normalizedCounts[genes,] %>% DESeqDataSetFromMatrix( colData(LE), ~strain)  %>% DESeq  %>% myHeatmap

late embryo only RUVs

ruvs_le$normalizedCounts[is.na(as.integer(ruvs_le$normalizedCounts))] <- 0 
ruvs_le$normalizedCounts[genes,] %>% DESeqDataSetFromMatrix( colData(LE), ~strain)  %>% DESeq  %>% myHeatmap